Parks and Recreation was a television comedy show that aired on NBC from 2009 until 2015. I obtained the complete transcripts and performed text analysis on the show's dialogue. (Link to data below.)
# File names for each episode's transcript (one CSV file per episode)
file_names <- list.files(here("scripts"))
# Read every episode into one data frame. Build the paths with here() so the
# read works regardless of the current working directory — the original glued
# a relative "scripts/" prefix, which fails whenever getwd() is not the
# project root even though list.files() above already resolved via here().
parks <- file.path(here("scripts"), file_names) %>%
  map_dfr(read_csv) # read in all the episodes into one data frame!
# Tokenize lines to one word in each row.
# Order matters here: stop words are removed BEFORE str_extract() strips
# punctuation, so a token whose stop-word form only appears after stripping
# (e.g. trailing punctuation variants) may survive — acceptable for this
# exploratory analysis.
parks_token <- parks %>%
clean_names() %>% # janitor: normalize column names (e.g. Character -> character)
unnest_tokens(word, line) %>% # tokenize; also lowercases by default
anti_join(stop_words) %>% # remove stop words (joins on "word")
mutate(word = str_extract(word, "[a-z']+")) %>% # keep the first run of letters/apostrophes; drops digits entirely
drop_na(word) # take out tokens that had no letters at all (e.g. pure numbers)
# The ten characters with the highest total word counts. Background "Extra"
# lines are excluded before ranking. dplyr::filter is namespaced to avoid
# any clash with stats::filter.
top_characters <- parks_token %>%
  dplyr::filter(character != "Extra") %>%
  count(character) %>%
  arrange(desc(n)) %>%
  slice_max(n, n = 10)
# Words spoken by the top-10 characters, counted per character, keeping each
# character's 9 most frequent words (ties kept, matching the old top_n()).
parks_words <- parks_token %>%
  inner_join(top_characters, by = "character") %>% # keep top-10 characters only
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # drop informal filler words
  select(-n) %>% # drop the per-character totals brought along by the join
  count(word, character, sort = TRUE) %>%
  group_by(character) %>%
  slice_max(n, n = 9) %>% # modern replacement for superseded top_n(9)
  ungroup() # don't leak a grouped data frame to downstream code
# Sample of a few randomly chosen lines, rendered as a scrollable dark table.
parks %>%
  slice_sample(n = 20) %>% # samples from all rows; was sample(1:65942, 20) with the row count hard-coded
  kbl(caption = "<b style = 'color:white;'>
Sample of a few randomly chosen lines from Parks and Recreation") %>%
  # NOTE(review): kable_material_dark() documents `lightable_options`, not
  # `bootstrap_options` — confirm this argument is actually honored.
  kable_material_dark(bootstrap_options = c("striped", "hover")) %>%
  scroll_box(width = "100%", height = "300px",
             fixed_thead = list(enabled = TRUE, background = "#222222"))
| Character | Line |
|---|---|
| Ben Wyatt | Can you actually do this? |
| Leslie Knope | You’re never going to be able to pull it off. |
| Leslie Knope | Yes. |
| Leslie Knope | Well, they couldn’t make a banner in time for my press conference, so I am cobbling together a banner from all the other discarded banners. |
| Ann Perkins | We’re moving to a new city. |
| Andy Dwyer | I want nickels. |
| Ben Wyatt | And I didn’t really like it. |
| Tom Haverford | I’m good at this stuff. |
| Donna Meagle | One for pleasure, and one to cap off a long con I was running against Keith Sweat. |
| Chance Frenlm | Topless park! |
| Leslie Knope | I’m gonna call Kaboom, and check on their availability for next year. |
| Ben Wyatt | Now, in this town it’s bigger than the super bowl. |
| Ann Perkins | What the hell happened? |
| Ben Wyatt | Oh. |
| Tom Haverford | Excuse me! |
| Donna Meagle | When did you make the switch to boxer briefs? |
| Linda Lonegan | My Uncle Lon once tried to |
| Leslie Knope | Did I hit your boob? |
| Leslie Knope | So first up, meet and greet. |
| Leslie Knope | He doesn’t know anybody in town. |
# Horizontal bar chart of each top character's most frequent words,
# one facet per character (free scales so each panel fits its own words).
ggplot(parks_words, aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "plasma") +
  facet_wrap(~character, scales = "free") +
  theme_brooklyn99() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 10.5,
                              face = "bold",
                              color = "white")
  )
# Per-character word clouds drawn over a photo of the character.
# The four clouds below follow an identical recipe, so the repeated code is
# factored into two helpers; every original variable (*_words, *_pic, *_cloud)
# is still created with the same name and value.

# Top `n_top` words spoken by `who`, with informal filler words removed.
character_top_words <- function(tokens, who, n_top = 25,
                                filler = c("hey", "yeah", "gonna")) {
  tokens %>%
    filter(character == who) %>%
    filter(!word %in% filler) %>%
    count(word) %>%
    slice_max(n, n = n_top)
}

# Word cloud sized by frequency, drawn over `pic` in the given shape.
character_cloud <- function(words, pic, shape) {
  ggplot(data = words, aes(label = word)) +
    background_image(pic) + # add image of character behind the cloud
    geom_text_wordcloud(aes(size = n),
                        color = "turquoise1",
                        shape = shape) +
    scale_size_area(max_size = 6) +
    theme_void()
}

# Ron Swanson
swanson_words <- character_top_words(parks_token, "Ron Swanson")
swanson_pic <- jpeg::readJPEG(here("images", "ron_swanson.jpg"))
swanson_cloud <- character_cloud(swanson_words, swanson_pic, "circle")

# Leslie Knope
knope_words <- character_top_words(parks_token, "Leslie Knope")
knope_pic <- jpeg::readJPEG(here("images", "knope.jpg"))
knope_cloud <- character_cloud(knope_words, knope_pic, "star")

# April Ludgate
april_words <- character_top_words(parks_token, "April Ludgate")
april_pic <- jpeg::readJPEG(here("images", "april.jpeg"))
april_cloud <- character_cloud(april_words, april_pic, "triangle-upright")

# Andy Dwyer
andy_words <- character_top_words(parks_token, "Andy Dwyer")
andy_pic <- jpeg::readJPEG(here("images", "andy.jpg"))
andy_cloud <- character_cloud(andy_words, andy_pic, "diamond")
# Final patchwork wordcloud: arrange the four character clouds in a 2x2 grid
# (top row: Andy, April; bottom row: Knope, Swanson), then use patchwork's
# `&` operator to apply the dark background theme to every panel at once.
patchwork <- (andy_cloud + april_cloud) / (knope_cloud + swanson_cloud)
patchwork & theme(plot.background = element_rect(fill = "#222222",
color = "#222222"),
strip.background = element_rect(fill = "#222222",
color = "#222222"))
# Re-read all episodes into `parks`.
# NOTE(review): this duplicates the read performed at the top of the script;
# presumably kept so this section can be run independently — confirm before
# removing.
parks <- str_glue("scripts/{file_names}") %>%
map_dfr(read_csv) # read in all the episodes into one data frame!
# Strip the first two characters (the season prefix) from each file name so
# season boundaries can be found by looking for "e01.csv".
file_names_season <- str_sub(file_names, start = 3L)
# The which() lookup below was used interactively to locate each season's
# first episode; the hard-coded index ranges that follow came from it.
# which(file_names_season == "e01.csv")
# season_1 <- str_glue("scripts/{file_names[1:6]}") %>%
# map_dfr(read_csv)
#
# season_2 <- str_glue("scripts/{file_names[7:30]}") %>%
# map_dfr(read_csv)
#
# season_3 <- str_glue("scripts/{file_names[31:46]}") %>%
# map_dfr(read_csv)
# Season 4 only (files 47-68).
# NOTE(review): these indices break if transcript files are added, removed,
# or renamed — consider selecting by season prefix instead (e.g. str_subset).
season_4 <- str_glue("scripts/{file_names[47:68]}") %>%
map_dfr(read_csv)
# season_5 <- str_glue("scripts/{file_names[69:90]}") %>%
# map_dfr(read_csv)
#
# season_6 <- str_glue("scripts/{file_names[91:110]}") %>%
# map_dfr(read_csv)
#
# season_7 <- str_glue("scripts/{file_names[111:length(file_names)]}") %>%
# map_dfr(read_csv)
# Word-level AFINN sentiment for the whole show, smoothed with a centered
# 151-word moving average (75 words on each side of the current word).
parks_afinn <- parks_token %>%
  inner_join(get_sentiments("afinn"), by = "word") %>% # AFINN score per word
  drop_na(value) %>%
  mutate(index = row_number()) %>% # running word position; was seq(1, length(word), 1)
  mutate(moving_avg = as.numeric(slide(value,
                                       mean,
                                       .before = (151 - 1) / 2,
                                       .after = (151 - 1) / 2))) %>%
  # Label each position by the sign of the smoothed sentiment.
  mutate(neg_pos = factor(case_when(
    moving_avg > 0 ~ "Positive",
    moving_avg <= 0 ~ "Negative"
  )))
# Bar chart of the smoothed AFINN sentiment across the whole show,
# colored green/red by sign, on the dark site theme.
sent_plot <- ggplot(parks_afinn, aes(x = index, y = moving_avg)) +
  geom_col(aes(fill = neg_pos)) +
  scale_fill_manual(values = c("Positive" = "springgreen2",
                               "Negative" = "darkred")) +
  theme_minimal() +
  labs(x = "Index", y = "Moving Average sentiment", fill = "") +
  theme(
    plot.background = element_rect(fill = "#222222", color = "#222222"),
    strip.background = element_rect(fill = "#222222", color = "#222222"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 11, face = "bold", color = "white"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(color = "white", size = 12, face = "bold"),
    legend.text = element_text(color = "white", size = 11, face = "bold")
  )
# Make it interactive:
# ggplotly(sent_plot, tooltip = c("index", "character", "word", "value"))
sent_plot
# Tokenize the season-4 lines to one word per row — the same recipe used for
# the full-show `parks_token` above.
season_token <- season_4 %>%
  clean_names() %>%
  unnest_tokens(word, line) %>% # one lowercase token per row
  anti_join(stop_words) %>% # drop common stop words (joins on "word")
  mutate(word = str_extract(word, "[a-z']+")) %>% # letters/apostrophes only
  filter(!is.na(word)) # same effect as drop_na(word)
# Word-level AFINN sentiment for season 4, smoothed with a centered 51-word
# moving average (25 words on each side — a narrower window than the
# show-wide plot because a single season has far fewer words).
season_afinn <- season_token %>%
  inner_join(get_sentiments("afinn"), by = "word") %>% # AFINN score per word
  drop_na(value) %>%
  mutate(index = row_number()) %>% # running word position; was seq(1, length(word), 1)
  mutate(moving_avg = as.numeric(slide(value,
                                       mean,
                                       .before = (51 - 1) / 2,
                                       .after = (51 - 1) / 2)))
# Season-4 sentiment bar chart: smoothed AFINN average per word position,
# filled on a diverging brown-green scale instead of the binary sign colors
# used for the show-wide plot.
season_plot <- ggplot(data = season_afinn, aes(x = index, y = moving_avg)) +
geom_col(aes(fill = moving_avg)) +
scale_fill_distiller(type = "div",
palette = "BrBG")+
theme_minimal() +
labs(x = "Index",
y = "AFINN sentiment",
fill = "") +
theme(panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 11,
face = "bold",
color = "white"),
axis.title.y = element_text(color = "white",
size = 12,
face = "bold"),
axis.title.x = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = "#222222",
color = "#222222"),
strip.background = element_rect(fill = "#222222",
color = "#222222"),
legend.text = element_text(color = "white",
size = 11,
face = "bold"))
# Make it interactive (note: this plot, not sent_plot):
# ggplotly(season_plot, tooltip = c("index", "character", "word", "value"))
season_plot
# NRC emotion counts for the words of the top-10 characters.
characters_sent <- parks_token %>%
  inner_join(top_characters, by = "character") %>% # keep top-10 characters only
  filter(!word %in% c("hey", "yeah", "gonna")) %>% # drop informal filler words
  select(-n) %>% # drop the per-character totals brought along by the join
  # A word can carry several NRC emotions, so this join fans rows out —
  # that is intentional: each (word, emotion) pair should be counted.
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(sentiment, character, sort = TRUE)
# Horizontal bar chart of NRC emotion counts, one facet per character
# (free scales so each panel fits its own range).
ggplot(characters_sent, aes(x = n, y = sentiment, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_c(option = "plasma") +
  facet_wrap(~character, scales = "free") +
  theme_brooklyn99() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(size = 9),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    strip.text = element_text(size = 10.5,
                              face = "bold",
                              color = "white")
  )